{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<torch._C.Generator at 0x10b52d390>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "import torch.optim as optim\n",
    "\n",
    "torch.manual_seed(1234)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_to_ix = {\"data\": 0, \"science\": 1}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'data': 0, 'science': 1}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word_to_ix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "embeds = nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Embedding(2, 5)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "embeds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([0])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lookup_tensor = torch.tensor([word_to_ix[\"data\"]], dtype=torch.long)\n",
    "lookup_tensor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[ 0.0461,  0.4024, -1.0115,  0.2167, -0.6123]], grad_fn=<EmbeddingBackward>)\n"
     ]
    }
   ],
   "source": [
    "hello_embed = embeds(lookup_tensor)\n",
    "print(hello_embed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "CONTEXT_SIZE = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "EMBEDDING_DIM = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(['The', 'popularity'], 'of'), (['popularity', 'of'], 'the'), (['of', 'the'], 'term')]\n"
     ]
    }
   ],
   "source": [
    "test_sentence = \"\"\"The popularity of the term \"data science\" has exploded in \n",
    "business environments and academia, as indicated by a jump in job openings.[32] \n",
    "However, many critical academics and journalists see no distinction between data \n",
    "science and statistics. Writing in Forbes, Gil Press argues that data science is a \n",
    "buzzword without a clear definition and has simply replaced “business analytics” in \n",
    "contexts such as graduate degree programs.[7] In the question-and-answer section of \n",
    "his keynote address at the Joint Statistical Meetings of American Statistical \n",
    "Association, noted applied statistician Nate Silver said, “I think data-scientist \n",
    "is a sexed up term for a statistician....Statistics is a branch of science. \n",
    "Data scientist is slightly redundant in some way and people shouldn’t berate the \n",
    "term statistician.”[9] Similarly, in business sector, multiple researchers and \n",
    "analysts state that data scientists alone are far from being sufficient in granting \n",
    "companies a real competitive advantage[33] and consider data scientists as only \n",
    "one of the four greater job families companies require to leverage big \n",
    "data effectively, namely: data analysts, data scientists, big data developers \n",
    "and big data engineers.[34]\n",
    "\n",
    "On the other hand, responses to criticism are as numerous. In a 2014 Wall Street \n",
    "Journal article, Irving Wladawsky-Berger compares the data science enthusiasm with \n",
    "the dawn of computer science. He argues data science, like any other interdisciplinary \n",
    "field, employs methodologies and practices from across the academia and industry, but \n",
    "then it will morph them into a new discipline. He brings to attention the sharp criticisms \n",
    "computer science, now a well respected academic discipline, had to once face.[35] Likewise, \n",
    "NYU Stern's Vasant Dhar, as do many other academic proponents of data science,[35] argues \n",
    "more specifically in December 2013 that data science is different from the existing practice \n",
    "of data analysis across all disciplines, which focuses only on explaining data sets. \n",
    "Data science seeks actionable and consistent pattern for predictive uses.[1] This practical \n",
    "engineering goal takes data science beyond traditional analytics. Now the data in those \n",
    "disciplines and applied fields that lacked solid theories, like health science and social \n",
    "science, could be sought and utilized to generate powerful predictive models.[1]\"\"\".split()\n",
    "# we should tokenize the input, but we will ignore that for now\n",
    "# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)\n",
    "trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])\n",
    "            for i in range(len(test_sentence) - 2)]\n",
    "# print the first 3, just so you can see what they look like\n",
    "print(trigrams[:3])\n",
    "\n",
    "vocab = set(test_sentence)\n",
    "word_to_ix = {word: i for i, word in enumerate(vocab)}\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "class NGramLanguageModeler(nn.Module):\n",
    "\n",
    "    def __init__(self, vocab_size, embedding_dim, context_size):\n",
    "        super(NGramLanguageModeler, self).__init__()\n",
    "        self.embeddings = nn.Embedding(vocab_size, embedding_dim)\n",
    "        self.linear1 = nn.Linear(context_size * embedding_dim, 128)\n",
    "        self.linear2 = nn.Linear(128, vocab_size)\n",
    "\n",
    "    def forward(self, inputs):\n",
    "        embeds = self.embeddings(inputs).view((1, -1))\n",
    "        out = F.relu(self.linear1(embeds))\n",
    "        out = self.linear2(out)\n",
    "        log_probs = F.log_softmax(out, dim=1)\n",
    "        return log_probs\n",
    "\n",
    "\n",
    "losses = []\n",
    "loss_function = nn.NLLLoss()\n",
    "model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)\n",
    "optimizer = optim.SGD(model.parameters(), lr=0.001)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "losses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NLLLoss()"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loss_function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "NGramLanguageModeler(\n",
       "  (embeddings): Embedding(228, 10)\n",
       "  (linear1): Linear(in_features=20, out_features=128, bias=True)\n",
       "  (linear2): Linear(in_features=128, out_features=228, bias=True)\n",
       ")"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SGD (\n",
       "Parameter Group 0\n",
       "    dampening: 0\n",
       "    lr: 0.001\n",
       "    momentum: 0\n",
       "    nesterov: False\n",
       "    weight_decay: 0\n",
       ")"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "optimizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1870.7198195457458, 1857.1877717971802, 1843.8901081085205, 1830.8134832382202, 1817.9432640075684, 1805.2666230201721, 1792.780930519104, 1780.4787874221802, 1768.3613789081573, 1756.433236837387]\n"
     ]
    }
   ],
   "source": [
    "for epoch in range(10):\n",
    "    total_loss = 0\n",
    "    for context, target in trigrams:\n",
    "\n",
    "        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words\n",
    "        # into integer indices and wrap them in tensors)\n",
    "        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)\n",
    "\n",
    "        # Step 2. Recall that torch *accumulates* gradients. Before passing in a\n",
    "        # new instance, you need to zero out the gradients from the old\n",
    "        # instance\n",
    "        model.zero_grad()\n",
    "\n",
    "        # Step 3. Run the forward pass, getting log probabilities over next\n",
    "        # words\n",
    "        log_probs = model(context_idxs)\n",
    "\n",
    "        # Step 4. Compute your loss function. (Again, Torch wants the target\n",
    "        # word wrapped in a tensor)\n",
    "        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))\n",
    "\n",
    "        # Step 5. Do the backward pass and update the gradient\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "        # Get the Python number from a 1-element Tensor by calling tensor.item()\n",
    "        total_loss += loss.item()\n",
    "    losses.append(total_loss)\n",
    "print(losses)  # The loss decreased every iteration over the training data!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_text = \"\"\"For the future of data science, Donoho projects an ever-growing \n",
    "environment for open science where data sets used for academic publications are \n",
    "accessible to all researchers.[36] US National Institute of Health has already announced \n",
    "plans to enhance reproducibility and transparency of research data.[39] Other big journals \n",
    "are likewise following suit.[40][41] This way, the future of data science not only exceeds \n",
    "the boundary of statistical theories in scale and methodology, but data science will \n",
    "revolutionize current academia and research paradigms.[36] As Donoho concludes, \"the scope \n",
    "and impact of data science will continue to expand enormously in coming decades as scientific \n",
    "data and data about science itself become ubiquitously available.\"[36]\"\"\".split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(['For', 'the', 'of', 'data'], 'future'), (['the', 'future', 'data', 'science,'], 'of'), (['future', 'of', 'science,', 'Donoho'], 'data'), (['of', 'data', 'Donoho', 'projects'], 'science,'), (['data', 'science,', 'projects', 'an'], 'Donoho')]\n"
     ]
    }
   ],
   "source": [
    "# By deriving a set from `raw_text`, we deduplicate the array\n",
    "vocab = set(raw_text)\n",
    "vocab_size = len(vocab)\n",
    "\n",
    "word_to_ix = {word: i for i, word in enumerate(vocab)}\n",
    "data = []\n",
    "for i in range(2, len(raw_text) - 2):\n",
    "    context = [raw_text[i - 2], raw_text[i - 1],\n",
    "               raw_text[i + 1], raw_text[i + 2]]\n",
    "    target = raw_text[i]\n",
    "    data.append((context, target))\n",
    "print(data[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([32,  0, 49, 17])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "class CBOW(nn.Module):\n",
    "\n",
    "    def __init__(self):\n",
    "        pass\n",
    "\n",
    "    def forward(self, inputs):\n",
    "        pass\n",
    "# create your model and train.  here are some functions to help you make\n",
    "# the data ready for use by your module\n",
    "def make_context_vector(context, word_to_ix):\n",
    "    idxs = [word_to_ix[w] for w in context]\n",
    "    return torch.tensor(idxs, dtype=torch.long)\n",
    "\n",
    "make_context_vector(data[0][0], word_to_ix)  # example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[-0.7850,  0.8883,  1.1011],\n",
      "        [ 0.3344, -0.3598,  0.5535]], grad_fn=<ThAddmmBackward>)\n"
     ]
    }
   ],
   "source": [
    "lin = nn.Linear(5, 3)  # maps from R^5 to R^3, parameters A, b\n",
    "# data is 2x5.  A maps from 5 to 3... can we map \"data\" under A?\n",
    "data = torch.randn(2, 5)\n",
    "print(lin(data))  # yes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[ 1.6053, -0.1710],\n",
      "        [ 1.4815, -1.1123]])\n",
      "tensor([[1.6053, 0.0000],\n",
      "        [1.4815, 0.0000]])\n"
     ]
    }
   ],
   "source": [
    "data = torch.randn(2, 2)\n",
    "print(data)\n",
    "print(F.relu(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([-0.4417, -2.5164, -0.2034, -2.1575, -1.2533])\n",
      "tensor([0.3313, 0.0416, 0.4204, 0.0596, 0.1471])\n",
      "tensor(1.)\n",
      "tensor([-1.1048, -3.1795, -0.8665, -2.8206, -1.9164])\n"
     ]
    }
   ],
   "source": [
    "# Softmax is also in torch.nn.functional\n",
    "data = torch.randn(5)\n",
    "print(data)\n",
    "print(F.softmax(data, dim=0))\n",
    "print(F.softmax(data, dim=0).sum())  # Sums to 1 because it is a distribution!\n",
    "print(F.log_softmax(data, dim=0))  # theres also log_softmax"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[[-0.1500,  0.0547,  0.3930]],\n",
      "\n",
      "        [[-0.1313, -0.0478,  0.0857]],\n",
      "\n",
      "        [[-0.1131,  0.0047, -0.1003]],\n",
      "\n",
      "        [[ 0.0176, -0.2464, -0.1589]],\n",
      "\n",
      "        [[-0.0523,  0.1781, -0.1713]]], grad_fn=<CatBackward>)\n",
      "(tensor([[[-0.0523,  0.1781, -0.1713]]], grad_fn=<ViewBackward>), tensor([[[-0.1997,  0.5137, -0.6064]]], grad_fn=<ViewBackward>))\n"
     ]
    }
   ],
   "source": [
    "lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3\n",
    "inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5\n",
    "\n",
    "# initialize the hidden state.\n",
    "hidden = (torch.randn(1, 1, 3),\n",
    "          torch.randn(1, 1, 3))\n",
    "for i in inputs:\n",
    "    # Step through the sequence one element at a time.\n",
    "    # after each step, hidden contains the hidden state.\n",
    "    out, hidden = lstm(i.view(1, 1, -1), hidden)\n",
    "inputs = torch.cat(inputs).view(len(inputs), 1, -1)\n",
    "hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # clean out hidden state\n",
    "out, hidden = lstm(inputs, hidden)\n",
    "print(out)\n",
    "print(hidden)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_sequence(seq, to_ix):\n",
    "    idxs = [to_ix[w] for w in seq]\n",
    "    return torch.tensor(idxs, dtype=torch.long)\n",
    "\n",
    "\n",
    "training_data = [\n",
    "    (\"Probability and random variable are integral part of computation \".split(), \n",
    "     [\"DET\", \"NN\", \"V\", \"DET\", \"NN\"]),\n",
    "    (\"Understanding of the probability and associated concepts are essential\".split(), \n",
    "     [\"NN\", \"V\", \"DET\", \"NN\"])\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(['Probability',\n",
       "   'and',\n",
       "   'random',\n",
       "   'variable',\n",
       "   'are',\n",
       "   'integral',\n",
       "   'part',\n",
       "   'of',\n",
       "   'computation'],\n",
       "  ['DET', 'NN', 'V', 'DET', 'NN']),\n",
       " (['Understanding',\n",
       "   'of',\n",
       "   'the',\n",
       "   'probability',\n",
       "   'and',\n",
       "   'associated',\n",
       "   'concepts',\n",
       "   'are',\n",
       "   'essential'],\n",
       "  ['NN', 'V', 'DET', 'NN'])]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'Probability': 0, 'and': 1, 'random': 2, 'variable': 3, 'are': 4, 'integral': 5, 'part': 6, 'of': 7, 'computation': 8, 'Understanding': 9, 'the': 10, 'probability': 11, 'associated': 12, 'concepts': 13, 'essential': 14}\n"
     ]
    }
   ],
   "source": [
    "word_to_ix = {}\n",
    "for sent, tags in training_data:\n",
    "    for word in sent:\n",
    "        if word not in word_to_ix:\n",
    "            word_to_ix[word] = len(word_to_ix)\n",
    "print(word_to_ix)\n",
    "tag_to_ix = {\"DET\": 0, \"NN\": 1, \"V\": 2}\n",
    "\n",
    "EMBEDDING_DIM = 6\n",
    "HIDDEN_DIM = 6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "class LSTMTagger(nn.Module):\n",
    "\n",
    "    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):\n",
    "        super(LSTMTagger, self).__init__()\n",
    "        self.hidden_dim = hidden_dim\n",
    "\n",
    "        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)\n",
    "\n",
    "        # The LSTM takes word embeddings as inputs, and outputs hidden states\n",
    "        # with dimensionality hidden_dim.\n",
    "        self.lstm = nn.LSTM(embedding_dim, hidden_dim)\n",
    "\n",
    "        # The linear layer that maps from hidden state space to tag space\n",
    "        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)\n",
    "        self.hidden = self.init_hidden()\n",
    "\n",
    "    def init_hidden(self):\n",
    "        # Before we've done anything, we dont have any hidden state.\n",
    "        # Refer to the Pytorch documentation to see exactly\n",
    "        # why they have this dimensionality.\n",
    "        # The axes semantics are (num_layers, minibatch_size, hidden_dim)\n",
    "        return (torch.zeros(1, 1, self.hidden_dim),\n",
    "                torch.zeros(1, 1, self.hidden_dim))\n",
    "\n",
    "    def forward(self, sentence):\n",
    "        embeds = self.word_embeddings(sentence)\n",
    "        lstm_out, self.hidden = self.lstm(\n",
    "            embeds.view(len(sentence), 1, -1), self.hidden)\n",
    "        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))\n",
    "        tag_scores = F.log_softmax(tag_space, dim=1)\n",
    "        return tag_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SGD (\n",
       "Parameter Group 0\n",
       "    dampening: 0\n",
       "    lr: 0.1\n",
       "    momentum: 0\n",
       "    nesterov: False\n",
       "    weight_decay: 0\n",
       ")"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))\n",
    "loss_function = nn.NLLLoss()\n",
    "optimizer = optim.SGD(model.parameters(), lr=0.1)\n",
    "model\n",
    "loss_function\n",
    "optimizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[-1.0414, -1.1928, -1.0680],\n",
      "        [-1.0747, -1.2163, -1.0154],\n",
      "        [-1.0706, -1.2298, -1.0083],\n",
      "        [-1.0661, -1.2428, -1.0022],\n",
      "        [-1.0013, -1.2948, -1.0254],\n",
      "        [-1.0539, -1.2640, -0.9973],\n",
      "        [-1.0718, -1.2705, -0.9757],\n",
      "        [-0.9919, -1.2527, -1.0689],\n",
      "        [-0.9726, -1.2880, -1.0611]])\n"
     ]
    }
   ],
   "source": [
    "with torch.no_grad():\n",
    "    inputs = prepare_sequence(training_data[0][0], word_to_ix)\n",
    "    tag_scores = model(inputs)\n",
    "    print(tag_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Expected input batch_size (9) to match target batch_size (5).",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-31-dc168b4b74e1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     19\u001b[0m         \u001b[0;31m# Step 4. Compute the loss, gradients, and update the parameters by\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     20\u001b[0m         \u001b[0;31m#  calling optimizer.step()\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m         \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloss_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtag_scores\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtargets\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     22\u001b[0m         \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     23\u001b[0m         \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m    475\u001b[0m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    476\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 477\u001b[0;31m             \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    478\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mhook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    479\u001b[0m             \u001b[0mhook_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/lib/python3.6/site-packages/torch/nn/modules/loss.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input, target)\u001b[0m\n\u001b[1;32m    198\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    199\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 200\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnll_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mignore_index\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduction\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreduction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    201\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    202\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Applications/anaconda3/lib/python3.6/site-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mnll_loss\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction)\u001b[0m\n\u001b[1;32m   1403\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1404\u001b[0m         raise ValueError('Expected input batch_size ({}) to match target batch_size ({}).'\n\u001b[0;32m-> 1405\u001b[0;31m                          .format(input.size(0), target.size(0)))\n\u001b[0m\u001b[1;32m   1406\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mdim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1407\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_nn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnll_loss\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_Reduction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_enum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreduction\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Expected input batch_size (9) to match target batch_size (5)."
     ]
    }
   ],
   "source": [
    "for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data\n",
    "    for sentence, tags in training_data:\n",
    "        # Step 1. Remember that Pytorch accumulates gradients.\n",
    "        # We need to clear them out before each instance\n",
    "        model.zero_grad()\n",
    "\n",
    "        # Also, we need to clear out the hidden state of the LSTM,\n",
    "        # detaching it from its history on the last instance.\n",
    "        model.hidden = model.init_hidden()\n",
    "\n",
    "        # Step 2. Get our inputs ready for the network, that is, turn them into\n",
    "        # Tensors of word indices.\n",
    "        sentence_in = prepare_sequence(sentence, word_to_ix)\n",
    "        targets = prepare_sequence(tags, tag_to_ix)\n",
    "\n",
    "        # Step 3. Run our forward pass.\n",
    "        tag_scores = model(sentence_in)\n",
    "\n",
    "        # Step 4. Compute the loss, gradients, and update the parameters by\n",
    "        #  calling optimizer.step()\n",
    "        loss = loss_function(tag_scores, targets)\n",
    "        loss.backward()\n",
    "        optimizer.step()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[-0.9758, -1.2959, -1.0513],\n",
      "        [-1.0012, -1.2932, -1.0267],\n",
      "        [-1.0178, -1.2790, -1.0208],\n",
      "        [-1.0394, -1.2699, -1.0066],\n",
      "        [-0.9647, -1.3276, -1.0391],\n",
      "        [-1.0317, -1.2859, -1.0019],\n",
      "        [-1.0524, -1.2877, -0.9808],\n",
      "        [-0.9821, -1.2619, -1.0719],\n",
      "        [-0.9645, -1.2950, -1.0644]])\n"
     ]
    }
   ],
   "source": [
    "# See what the scores are after training\n",
    "with torch.no_grad():\n",
    "    inputs = prepare_sequence(training_data[0][0], word_to_ix)\n",
    "    tag_scores = model(inputs)\n",
    "    print(tag_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
